Prediction Challenge - Germany

Author

Jonathan Kennel

Lagged linear regression model

This model can be interpreted and refined based on the responses of each regressor group. It will not be the most accurate predictor, but it is fast and can help identify the most important components.

Set-up

start_time <- Sys.time()
# Load helper packages
library(data.table)
library(plotly)
library(tidymodels)
library(hydrorecipes)
library(dlnm)
library(knitr)

# new names for predictors
nms_other <- c('datetime',
               'precipitation',
               'temperature_mean',
               'temperature_min',
               'temperature_max',
               'sea_pressure',
               'humidity',
               'wind',
               'insolation',
               'evapotranspiration')

Prepare data

outcome    <- fread('../../data/Germany/heads.csv')
predictors <- fread('../../data/Germany/input_data.csv')

# make names more verbose
setnames(outcome, c('datetime', 'wl'))
setnames(predictors, nms_other)

# join data and make a numeric time column
dat <- outcome[predictors, on = 'datetime']

# join data and make a numeric time column
dat <- outcome[predictors, on = 'datetime']

# ad hoc estimate of water deficit. Use distributed lag model for predictors
dat[, deficit := cumsum(scale(precipitation)) - cumsum(scale(evapotranspiration))]
dat[, deficit := lm(deficit~splines::ns(datetime, df = 18))$residuals]
varknots <- dlnm::equalknots(dat$deficit, fun = "bs", degree = 3, df = 5)
lagknots <- logknots(365 * 3, 17)
cb <- crossbasis(dat$deficit, lag = 365 * 3,
                 argvar = list(fun = "bs", knots = varknots),
                 arglag = list(knots = lagknots))
dat <- cbind(dat, cb)

# date predictors
dat[, dow := lubridate::wday(datetime, label = FALSE)]
dat[, doy := lubridate::yday(datetime)]
dat[, mon := lubridate::month(datetime)]

# scaled precipitation based on ET
dat[, precip_evapo := precipitation * 1.0 / evapotranspiration]

# ad hoc snow melt
dat[, min_temp_diff := c(0, diff(temperature_min, lag = 1))]
dat[, snow_melt := 0]
dat[min_temp_diff >= 7  & month(datetime) %in% 1:3, snow_melt := 1]
dat[, snow_melt := pmin(snow_melt * precipitation, 1)]

# create feature dataset
all <- recipe(wl~., dat) |>
  step_interact(terms = ~ precipitation:evapotranspiration) |>
  step_interact(terms = ~ precipitation:temperature_mean) |>
  step_distributed_lag(sea_pressure,     knots = log_lags(5, 15)) |>
  step_distributed_lag(precipitation,    knots = log_lags(30, 180)) |>
  step_distributed_lag(precip_evapo,     knots = log_lags(21, 120)) |>
  step_distributed_lag(snow_melt, knots = log_lags(35, 90)) |>
  step_ns(humidity, deg_free = 20) |>
  step_ns(wind, deg_free = 8) |>
  step_ns(doy, deg_free = 13) |>
  step_ns(mon, deg_free = 6) |>
  step_ns(dow, deg_free = 4) |>
  step_rm(datetime) |>
  step_corr(all_predictors()) |>
  prep() |>
  bake(new_data = NULL)

setDT(all)

Fit model

fit <- lm(wl~., all)

Make predictions

dat <- cbind(dat, predict(fit, all, interval = "prediction"))

Plot results

A median filter to smooth the results as they appeared to have too much variance.

p1 <- plot_ly(dat, x = ~datetime, 
              y = ~wl, 
              type = "scatter", 
              mode = "lines", 
              name = "Water Level",
              line = list(color = "#808080")) |>
  add_lines(x = ~datetime, y = ~runmed(fit, 7), name = "Predictions" ,
            line = list(color = "#6000FF60"))

p2 <- plot_ly(dat, x = ~datetime, 
              y = ~wl - fit, 
              type = "scatter", 
              mode = "lines", 
              name = "Residuals",
              line = list(color = "#808080")) 
subplot(p1, p2, shareX = TRUE, nrows = 2)

Output submission

submission_times <- fread("submission_form_Germany.csv")
submission <- dat[datetime %in% submission_times$Date]

submission <- submission[, list(Date = datetime,
                         `Simulated Head` = fit,
                         `95% Lower Bound` = lwr,
                         `95% Upper Bound` = upr)]
fwrite(submission, "submission_form_Germany.csv")

end_time <- Sys.time()

Timings

Total elapsed time is 5.5 seconds.

Computer and software specs

Macbook Air M1 2020

16 GB Ram

sessionInfo()
R version 4.2.1 (2022-06-23)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS Ventura 13.1

Matrix products: default
LAPACK: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRlapack.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] knitr_1.41            dlnm_2.4.7            hydrorecipes_0.0.5   
 [4] yardstick_1.1.0       workflowsets_1.0.0    workflows_1.1.2      
 [7] tune_1.0.1            tidyr_1.2.1           tibble_3.1.8         
[10] rsample_1.1.1         recipes_1.0.3         purrr_1.0.0          
[13] parsnip_1.0.3         modeldata_1.0.1       infer_1.0.4          
[16] dplyr_1.0.10          dials_1.1.0           scales_1.2.1         
[19] broom_1.0.2           tidymodels_1.0.0.9000 plotly_4.10.1        
[22] ggplot2_3.4.0         data.table_1.14.6    

loaded via a namespace (and not attached):
 [1] nlme_3.1-161        lubridate_1.9.0     DiceDesign_1.9     
 [4] httr_1.4.4          tools_4.2.1         backports_1.4.1    
 [7] utf8_1.2.2          R6_2.5.1            rpart_4.1.19       
[10] mgcv_1.8-41         DBI_1.1.3           lazyeval_0.2.2     
[13] colorspace_2.0-3    nnet_7.3-18         withr_2.5.0        
[16] tidyselect_1.2.0    compiler_4.2.1      cli_3.5.0          
[19] stringr_1.5.0       digest_0.6.31       rmarkdown_2.19     
[22] pkgconfig_2.0.3     htmltools_0.5.4     parallelly_1.33.0  
[25] lhs_1.1.6           fastmap_1.1.0       collapse_1.8.9     
[28] htmlwidgets_1.6.0   rlang_1.0.6         rstudioapi_0.14    
[31] generics_0.1.3      jsonlite_1.8.4      crosstalk_1.2.0    
[34] magrittr_2.0.3      Matrix_1.5-3        Rcpp_1.0.9         
[37] munsell_0.5.0       fansi_1.0.3         GPfit_1.0-8        
[40] tsModel_0.6-1       lifecycle_1.0.3     furrr_0.3.1        
[43] stringi_1.7.8       yaml_2.3.6          MASS_7.3-58.1      
[46] grid_4.2.1          parallel_4.2.1      listenv_0.9.0      
[49] lattice_0.20-45     earthtide_0.0.14    splines_4.2.1      
[52] pillar_1.8.1        fftw_1.0-7          future.apply_1.10.0
[55] codetools_0.2-18    glue_1.6.2          evaluate_0.19      
[58] RcppParallel_5.1.5  vctrs_0.5.1         foreach_1.5.2      
[61] gtable_0.3.1        future_1.30.0       assertthat_0.2.1   
[64] xfun_0.36           gower_1.0.1         prodlim_2019.11.13 
[67] class_7.3-20        survival_3.4-0      viridisLite_0.4.1  
[70] timeDate_4021.107   iterators_1.0.14    hardhat_1.2.0      
[73] lava_1.7.0          timechange_0.1.1    globals_0.16.2     
[76] ellipsis_0.3.2      ipred_0.9-13